# Keep things nice and tidy, all libraries go here
library(readxl)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(svglite)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(ggsci)
# Read the spreadsheet (skipping its first row) and keep only the rows whose
# Exclude cell is empty.
data <- read_excel("data_IEEE.xlsx", skip = 1) %>%
  filter(is.na(Exclude))
## New names:
## * `` -> ...35
# Years without any publication (for easy slicing)
years_no_publications <- c("1974", "1975", "1976", "1978")

# LABELS so slicing will not become a mess
# Column names of the SWEBoK area indicators; the last three (CF, MF, EF) are
# the ones left out of the "no foundation" variant below.
swebok_areas_labels <- c(
  "SR", "SD", "SC", "ST", "SM", "SCM", "SEM", "SEP", "SEMM", "SQ", "SEPP",
  "SEE", "CF", "MF", "EF"
)
swebok_areas_labels_no_foundation <- c(
  "SR", "SD", "SC", "ST", "SM", "SCM", "SEM", "SEP", "SEMM", "SQ", "SEPP",
  "SEE"
)
# Display names, in the same order as swebok_areas_labels_no_foundation
# (they are applied positionally as axis labels).
swebok_areas_labels_long <- c(
  "Requirements", "Design", "Construction", "Testing", "Maintenance",
  "Config. Mgmt.", "SE Mgmt.", "SE Processes", "SE Models&Methods",
  "Software Quality", "SE Prof. Practice", "SE Economics"
)
# Column names of the cognitive taxonomy indicators
cognitive_concepts_labels <- c(
  "Attention", "Selective attention", "Divided attention",
  "Sustained attention", "Memory", "Working memory", "Short-term memory",
  "Long-term memory", "Cognitive load", "Intrinsic CL", "Extrinsic CL",
  "Perception", "Problem solving", "Reasoning", "Decision making",
  "Cognitive biases", "Knowledge", "Explicit knowledge", "Tacit knowledge",
  "Techn. tacit knowl.", "Cogn. tacit knowl.", "Cognitive control",
  "Social Cognition"
)
# Column names of the assessment-procedure indicators
measures_labels <- c(
  "Qualit. measures", "Fieldwork", "Interview", "Task-based",
  "Open observation", "Quantit. measures", "Task performance",
  "Physiological meas.", "Subjective ratings", "Behavioral meas."
)

# COLORS
tol9qualitative <- c(
  "#332288", "#88CCEE", "#44AA99", "#117733", "#999933", "#DDCC77",
  "#CC6677", "#882255", "#AA4499"
)
NPG_modified <- c(
  "#F5E144", "#4DBBD5FF", "#00A087FF", "#3C5488FF", "#F39B7FFF",
  "#8491B4FF", "#91D1C2FF", "#DC0000FF", "#7E6148FF"
)
# Necessary for grouping by high-level category.
#
# Adds a `Concept` column to `data` that maps every value of its `Taxonomy`
# column to the high-level concept it belongs to. Taxonomy values that are not
# listed below get NA.
add_high_level_concepts_to_data <- function(data) {
  mutate(
    data,
    Concept = case_when(
      Taxonomy %in% c(
        "Attention", "Selective attention", "Divided attention",
        "Sustained attention"
      ) ~ "Attention",
      Taxonomy %in% c(
        "Memory", "Working memory", "Short-term memory", "Long-term memory"
      ) ~ "Memory",
      Taxonomy %in% c(
        "Cognitive load", "Extrinsic CL", "Intrinsic CL"
      ) ~ "Cognitive load",
      Taxonomy == "Perception" ~ "Perception",
      Taxonomy %in% c(
        "Problem solving", "Reasoning", "Decision making"
      ) ~ "Reasoning",
      Taxonomy %in% c("Cognitive biases") ~ "Cognitive biases",
      Taxonomy %in% c(
        "Knowledge", "Explicit knowledge", "Tacit knowledge",
        "Techn. tacit knowl.", "Cogn. tacit knowl."
      ) ~ "Knowledge",
      Taxonomy %in% c("Cognitive control") ~ "Cognitive control",
      Taxonomy == "Social Cognition" ~ "Social cognition"
    )
  )
}
# Bar chart of the number of publications per year; each bar carries its
# count as a white label. The plot is then written to an EPS file.
ggplot(data, aes(x = as.factor(Year))) +
  geom_bar() +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = 2,
    colour = "white",
    size = 2.5
  ) +
  labs(x = "Year", y = "Number of publications") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("yearly_distribution.eps")
## Saving 7 x 5 in image
# Cleaning not needed values: a "?" in Academia or Industry is treated as
# missing. Each publication is then labelled by which of the two columns is
# filled in.
# NOTE(review): every row that matches none of the first three rules ends up
# as "Both", including rows where a column holds something other than "1" --
# confirm that this catch-all is intended.
data <- data %>%
  mutate(
    Academia = replace(Academia, Academia == "?", NA),
    Industry = replace(Industry, Industry == "?", NA),
    Type = case_when(
      is.na(Academia) & is.na(Industry) ~ "None",
      Academia == "1" & is.na(Industry) ~ "Academia",
      Industry == "1" & is.na(Academia) ~ "Industry",
      TRUE ~ "Both"
    )
  )
# Bar chart of the publication types, most frequent type first, with the
# count printed in white inside each bar.
data %>%
  mutate(Type = fct_infreq(Type, ordered = TRUE)) %>%
  ggplot(aes(x = Type)) +
  geom_bar(width = 0.5) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = 3,
    colour = "white",
    size = 4
  ) +
  labs(x = "Type of publication", y = "Number of publications") +
  theme_bw()
ggsave("academia_industry_distribution.pdf")
## Saving 7 x 5 in image
A publication can be in more than one category at the same time.
# Bar chart of the number of publications per SWEBoK area, largest first.
data %>%
  # selecting columns corresponding to the SWEBoK Areas
  select(all_of(swebok_areas_labels)) %>%
  mutate_all(replace_na, 0) %>%
  summarise_all(sum) %>%
  gather(key = "SWEBOKArea", value = "publications", 1:15) %>%
  arrange(desc(publications)) %>%
  # freeze the sorted order so the bars are drawn from largest to smallest
  mutate(SWEBOKArea = fct_inorder(SWEBOKArea)) %>%
  ggplot(aes(x = SWEBOKArea, y = publications)) +
  geom_col() +
  geom_text(
    aes(label = publications),
    vjust = -0.3,
    colour = "black",
    size = 4
  ) +
  labs(x = "SWEBoK Area", y = "Number of publications") +
  theme_bw()
ggsave("swebok_distribution.pdf")
## Saving 7 x 5 in image
# Cross-product of the SWEBoK indicator columns (missing cells counted as 0):
# a square matrix with one row and one column per SWEBoK area.
swebokareas <- crossprod(
  data %>%
    # selecting columns corresponding to the SWEBoK Areas
    select(all_of(swebok_areas_labels)) %>%
    mutate_all(replace_na, 0) %>%
    as.matrix()
)
kable(swebokareas)
| | SR | SD | SC | ST | SM | SCM | SEM | SEP | SEMM | SQ | SEPP | SEE | CF | MF | EF |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SR | 49 | 18 | 5 | 2 | 4 | 0 | 7 | 2 | 4 | 0 | 7 | 0 | 0 | 0 | 1 |
| SD | 18 | 66 | 17 | 3 | 4 | 0 | 6 | 2 | 6 | 1 | 6 | 0 | 0 | 0 | 1 |
| SC | 5 | 17 | 77 | 5 | 22 | 1 | 3 | 2 | 2 | 0 | 3 | 0 | 0 | 0 | 0 |
| ST | 2 | 3 | 5 | 12 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| SM | 4 | 4 | 22 | 4 | 46 | 1 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| SCM | 0 | 0 | 1 | 0 | 1 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| SEM | 7 | 6 | 3 | 1 | 2 | 0 | 26 | 3 | 1 | 0 | 7 | 3 | 0 | 0 | 1 |
| SEP | 2 | 2 | 2 | 0 | 1 | 1 | 3 | 10 | 0 | 0 | 2 | 1 | 0 | 0 | 0 |
| SEMM | 4 | 6 | 2 | 0 | 0 | 0 | 1 | 0 | 8 | 0 | 1 | 0 | 0 | 0 | 0 |
| SQ | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | 0 | 0 | 0 |
| SEPP | 7 | 6 | 3 | 0 | 1 | 0 | 7 | 2 | 1 | 0 | 18 | 3 | 0 | 0 | 1 |
| SEE | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 0 | 0 | 3 | 5 | 0 | 0 | 0 |
| CF | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| MF | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| EF | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
# Interactive heat map of the SWEBoK co-occurrence matrix.
plot_ly(
  x = swebok_areas_labels,
  y = swebok_areas_labels,
  z = swebokareas,
  type = "heatmap"
)
# Long table with one row per SWEBoK area and cognitive taxonomy column:
# `count` is the sum of that taxonomy column over the publications of the
# area, `label` is the same number as text with zeros blanked out.
# External label vectors are wrapped in all_of() so that they are always read
# as column names and never confused with a column of the same name.
x <- data %>%
  select(all_of(swebok_areas_labels), all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # this column is not numeric after the replacement above; values that
  # cannot be parsed become NA (see the warning below)
  mutate(`Problem solving` = as.numeric(`Problem solving`)) %>%
  # use SWEBOK area as factor
  gather(key = "SWEBOK", value = pubs, all_of(swebok_areas_labels)) %>%
  # select areas for which there are publications
  filter(pubs > 0) %>%
  group_by(SWEBOK) %>%
  # number of publication for each area
  summarise_all(sum) %>%
  # remove pubs to reuse it later
  select(-pubs) %>%
  # count publications in each cognitive taxonomy area
  gather(key = "Taxonomy", value = "count", all_of(cognitive_concepts_labels)) %>%
  # add label for later
  mutate(label = str_replace(as.character(count), "^0", ""))
## Warning: NAs introduced by coercion
# Bubble plot
# Sort by Taxonomy and freeze that order as the factor levels, so the y axis
# follows the sorted taxonomy names.
x <- arrange(x, Taxonomy)
xf <- x$Taxonomy
xfu <- unique(xf)
x$Taxonomy <- factor(xf, levels = xfu)
p <- ggplot(x)
p +
  geom_point(
    aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), size = count),
    shape = 21, fill = "white", alpha = 0.60
  ) +
  geom_text(
    aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), label = label),
    size = 2
  ) +
  labs(x = "SWEBOK Area", y = "Taxonomy Area") +
  # The complete theme has to come first: adding theme_bw() after theme()
  # replaces the whole theme and throws away every tweak made below.
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1.1, size = 9, colour = "black"),
    axis.text.y = element_text(size = 8, colour = "black"),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10, colour = "black", vjust = 0.12),
    panel.grid.major = element_line(linetype = "dashed", size = 0.1, color = "black")
  )
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).
ggsave("swebok_taxonomy_bubble.pdf")
## Saving 7 x 5 in image
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).
# Preparing the dataset for analysing the research methods.
# Quantitative / Qualitative are 1 when at least one of the corresponding
# measure columns equals 1 and 0 otherwise (also when nothing is filled in);
# Both is 1 only when the two flags are 1.
data <- data %>%
  mutate(
    Quantitative = if_else(
      `Quantit. measures` == 1 | `Task performance` == 1 |
        `Physiological meas.` == 1 | `Subjective ratings` == 1 |
        `Behavioral meas.` == 1,
      1, 0,
      missing = 0
    ),
    Qualitative = if_else(
      Fieldwork == 1 | Interview == 1 | `Qualit. measures` == 1 |
        `Task-based` == 1 | `Open observation` == 1,
      1, 0,
      missing = 0
    ),
    Both = if_else(Qualitative == 1 & Quantitative == 1, 1, 0)
  )
Number of publications per year according to SWEBOK areas
# Stacked bars of the yearly totals of the SWEBoK indicator columns (SR to
# EF), one fill colour per area.
data %>%
  filter(is.na(Exclude)) %>%
  select(c(Year, SR:EF)) %>%
  gather("SWEBOK", "publications", 2:16) %>%
  mutate_all(replace_na, 0) %>%
  group_by(Year, SWEBOK) %>%
  summarise(total = sum(publications)) %>%
  ggplot(aes(x = as.factor(Year), y = total, fill = SWEBOK)) +
  geom_bar(stat = "sum") +
  labs(x = "Year", y = "Publications") +
  scale_fill_discrete(name = "SWEBOK Areas") +
  # stat "sum" adds a size aesthetic; hide its legend
  guides(size = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
ggsave("years_swebok.pdf")
## Saving 7 x 5 in image
# Add a row for every year from 1973 to 2016 that has no publication (all
# other columns are NA on those rows), then label each publication with its
# research method.
# NOTE(review): a row with Both != 1 and Qualitative != 1 is labelled
# "Quantitative" without looking at the Quantitative flag -- confirm that
# every publication has at least one method recorded.
data <- data %>%
  complete(Year = seq(1973, 2016)) %>%
  mutate(
    research_method = if_else(
      Both == 1,
      "Mixed",
      if_else(Qualitative == 1, "Qualitative", "Quantitative")
    )
  )
# Stacked bars of publications per year, filled by research method. Rows
# without a research method are drawn transparent and get an empty legend
# label.
ggplot(data, aes(x = as.factor(Year), fill = research_method)) +
  geom_bar() +
  scale_fill_discrete(
    name = "Research method",
    labels = c("Mixed", "Qualitative", "Quantitative", ""),
    na.value = "transparent"
  ) +
  labs(x = "Year", y = "Publications") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))
ggsave("years_researchmethods.pdf")
## Saving 7 x 5 in image
# Number of publications per SWEBoK area and research method (long format).
data.swebok.researchmethod <- data %>%
  # Rows without a research method carry no publication. Replacing their NA
  # with 0 would create a spurious "0" method in the legend (and mixes a
  # number into a character column), so they are dropped instead.
  filter(!is.na(research_method)) %>%
  select(all_of(swebok_areas_labels), research_method) %>%
  # only the numeric area indicators get their missing cells zeroed
  mutate_at(swebok_areas_labels, replace_na, 0) %>%
  group_by(research_method) %>%
  summarise_at(swebok_areas_labels, sum) %>%
  gather("SWEBOK", "Publications", all_of(swebok_areas_labels))

# Horizontal stacked bars, areas ordered by their total number of publications.
data.swebok.researchmethod %>%
  ggplot(aes(
    x = reorder(SWEBOK, Publications, sum),
    y = Publications,
    fill = research_method
  )) +
  geom_col() +
  coord_flip() +
  xlab("SWEBOK areas") +
  scale_fill_discrete(name = "Research method")
ggsave("SWBOK_researchmethods.pdf")
## Saving 7 x 5 in image
# Bubble plot of how often each cognitive taxonomy column and each measure
# column are both filled in for the same publication.
# The label vectors are wrapped in all_of() so they are always read as column
# names (this also silences the tidyselect ambiguity note).
data %>%
  # the padded years have no Identifier
  filter(!is.na(Identifier)) %>%
  select(Identifier, all_of(cognitive_concepts_labels), all_of(measures_labels)) %>%
  # one row per publication and filled-in taxonomy cell
  gather(Taxonomy, value, all_of(cognitive_concepts_labels)) %>%
  filter(!is.na(value)) %>%
  select(-value) %>%
  # ... expanded to one row per filled-in measure cell
  gather(Method, value, all_of(measures_labels)) %>%
  filter(!is.na(value)) %>%
  arrange(Identifier) %>%
  select(-Identifier, -value) %>%
  group_by(Taxonomy, Method) %>%
  tally(name = "Amount") %>%
  ggplot(aes(x = Method, y = Taxonomy, fill = Amount)) +
  geom_point(aes(size = Amount), alpha = 0.5) +
  theme(legend.position = "") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))
ggsave("taxonomy_methods.pdf")
## Saving 7 x 5 in image
# Heat map of the co-occurrences between SWEBoK areas (without the three
# foundation areas) and high-level cognitive concepts.
# The label vectors are wrapped in all_of() so they are always read as column
# names (this also silences the tidyselect ambiguity note).
data %>%
  select(all_of(swebok_areas_labels_no_foundation), all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  gather(Taxonomy, value2, all_of(cognitive_concepts_labels)) %>%
  add_high_level_concepts_to_data() %>%
  gather(SWEBOK, value, all_of(swebok_areas_labels_no_foundation)) %>%
  count(SWEBOK, Concept, value, value2) %>%
  # only the combination where both indicators are 1 is a co-occurrence;
  # every other combination contributes 0
  mutate(freq = ifelse(value == 1 & value2 == 1, n, 0)) %>%
  distinct(SWEBOK, Concept, freq) %>%
  group_by(SWEBOK, Concept) %>%
  summarize(total = sum(freq)) %>%
  ungroup() %>%
  ggplot(aes(
    fct_relevel(SWEBOK, swebok_areas_labels_no_foundation),
    fct_rev(Concept),
    fill = total
  )) +
  geom_tile() +
  scale_fill_continuous(low = "#fff9f7", high = "red") +
  xlab("SWEBOK area") +
  ylab("Concept") +
  guides(fill = guide_legend(title = "")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8)) +
  # positional labels: they rely on the level order set by fct_relevel() above
  scale_x_discrete(labels = swebok_areas_labels_long)
ggsave("taxomony_swebok_cooccurences.pdf")
## Saving 7 x 5 in image
# Heat map of the co-occurrences between high-level cognitive concepts and
# assessment procedures.
#
# The x axis shows the four specific qualitative procedures, then the generic
# qualitative column, then the four specific quantitative procedures and the
# generic quantitative column. The two generic columns are displayed as
# "Others" (assumes they are the catch-all categories -- confirm). Labels are
# derived from the level names instead of being listed positionally, so a
# label can no longer end up on a neighbouring column.
data %>%
  select(all_of(cognitive_concepts_labels), all_of(measures_labels)) %>%
  mutate_all(replace_na, 0) %>%
  gather(Taxonomy, value, all_of(cognitive_concepts_labels)) %>%
  add_high_level_concepts_to_data() %>%
  gather(Method, value2, all_of(measures_labels)) %>%
  # exactly one row (one tile) per cell: the number of rows where both the
  # concept indicator and the method indicator are 1
  group_by(Concept, Method) %>%
  summarise(freq = sum(value == 1 & value2 == 1, na.rm = TRUE)) %>%
  ungroup() %>%
  ggplot(aes(
    fct_relevel(
      Method,
      "Fieldwork", "Interview", "Task-based", "Open observation",
      "Qualit. measures",
      "Task performance", "Physiological meas.", "Subjective ratings",
      "Behavioral meas.",
      "Quantit. measures"
    ),
    fct_rev(Concept),
    fill = freq
  )) +
  geom_tile() +
  # separator between the five qualitative and the five quantitative columns
  geom_vline(xintercept = 5.5, size = 0.5, color = "darkgrey") +
  xlab("Assessment procedure") +
  ylab("Concept") +
  guides(fill = guide_legend(title = "")) +
  scale_x_discrete(
    labels = function(breaks) {
      ifelse(
        breaks %in% c("Qualit. measures", "Quantit. measures"),
        "Others",
        breaks
      )
    }
  ) +
  annotate(geom = "text", x = 8, y = 0.73, label = "Quantitative", size = 3, alpha = 0.4) +
  annotate(geom = "text", x = 3, y = 0.73, label = "Qualitative", size = 3, alpha = 0.4) +
  scale_fill_continuous(low = "#fff9f7", high = "darkgreen") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))
ggsave("taxonomy_method_cooccurences.pdf")
## Saving 7 x 5 in image
# Stacked bars of the yearly totals of every cognitive taxonomy column.
# The label vector is wrapped in all_of() so it is always read as column
# names and never confused with a column of the same name.
data %>%
  select(Year, all_of(cognitive_concepts_labels)) %>%
  gather("Taxonomy", "publications", all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # the gathered values are not numeric; values that cannot be parsed become NA
  mutate(publications = as.integer(publications)) %>%
  group_by(Year, Taxonomy) %>%
  summarise(total = sum(publications)) %>%
  ggplot(aes(as.factor(Year), total, fill = Taxonomy)) +
  geom_bar(stat = "sum") +
  xlab("Year") +
  ylab("Publications") +
  scale_fill_discrete(name = "Taxonomy Areas") +
  # stat "sum" adds a size aesthetic; hide its legend
  guides(size = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
## Warning: NAs introduced by coercion
## Warning: Removed 1 rows containing non-finite values (stat_sum).
# One row per year and cognitive taxonomy column with a positive value.
# The label vector is wrapped in all_of() so it is always read as column
# names and never confused with a column of the same name.
df.taxonomy <- data %>%
  select(Year, all_of(cognitive_concepts_labels)) %>%
  gather("Taxonomy", "publications", all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # for some reason recognized as char; unparsable values become NA and are
  # removed by the filter below
  mutate(publications = as.integer(publications)) %>%
  filter(publications > 0)
## Warning: NAs introduced by coercion
# Separate data frame holding, for every year, each taxonomy entry's share of
# that year's rows, formatted as a percentage string (used as text labels).
data.percentage <- df.taxonomy %>%
  count(Year, Taxonomy) %>%
  group_by(Year) %>%
  mutate(ratio = scales::percent(n / sum(n)))
# 100 % stacked bars of the taxonomy entries per year, with each segment's
# percentage printed in its centre.
df.taxonomy %>%
  ggplot(aes(x = as.factor(Year), fill = as.factor(Taxonomy))) +
  geom_bar(position = "fill") +
  geom_text(
    data = data.percentage,
    aes(y = n, label = ratio),
    position = position_fill(vjust = 0.5),
    colour = "white",
    size = 1.3
  ) +
  xlab("Year") +
  ylab("Publications %") +
  # no size aesthetic is mapped in this plot, so no size guide needs hiding
  scale_fill_discrete(name = "Topic") +
  scale_y_continuous(labels = percent) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  theme(
    legend.key.size = unit(.2, "cm"),
    legend.key.width = unit(0.2, "cm"),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6)
  )
# ggsave() takes plain numbers for width/height and a separate `units`
# argument; grid unit objects are not part of its interface.
ggsave("taxonomy_years.pdf", width = 9, height = 6.5, units = "in")
# Taxonomy rows enriched with their high-level concept.
df.concepts <- add_high_level_concepts_to_data(df.taxonomy)
# Number of rows per year, leaving out the years without publications.
df.years <- data %>%
  filter(!Year %in% years_no_publications) %>%
  count(Year)
# 100 % stacked bars of the high-level concepts per year, overlaid with a line
# of the yearly number of rows in df.years. The line is scaled to [0, 1] by
# the largest yearly count and read on the secondary axis, which multiplies
# by that same maximum.
ggplot() +
  geom_bar(
    data = df.concepts,
    aes(x = as.factor(Year), fill = Concept),
    position = "fill"
  ) +
  geom_line(
    data = df.years,
    aes(x = as.factor(Year), y = n / max(n), group = 1),
    size = 0.8
  ) +
  geom_point(
    data = df.years,
    aes(x = as.factor(Year), y = n / max(n), group = 1)
  ) +
  scale_y_continuous(
    labels = function(x) x * 100,
    name = "Publication %",
    sec.axis = sec_axis(
      ~ . * max(df.years$n),
      name = "Total publications",
      breaks = scales::breaks_extended(10)
    )
  ) +
  xlab("Year") +
  theme(
    panel.background = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_text(margin = margin(-15, 0, 0, 0, "pt")),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, vjust = 2.4)
  ) +
  scale_fill_manual(values = NPG_modified)
# ggsave() takes plain numbers for width/height and a separate `units`
# argument; grid unit objects are not part of its interface.
ggsave("concepts_years.pdf", width = 13, height = 6.5, units = "in")